In this document we analyze the PASS1B proteomics animal data of MoTrPAC. The flow steps are: (1) load the data from the google cloud bucket, (2) preprocess each dataset, (3) PCA analysis and correlation with metadata variables, and (4) flagging potential outliers, (5) removing outliers as revised by the proteomics groups, (6) perform differential abundance analysis.

# CONFIGURATIONS
# Specify the parameters required for the analysis

# This directory should have the MotrpacRatTraining6moQCRep repository
# clone from: https://github.com/MoTrPAC/MotrpacRatTraining6moQCRep
repo_local_dir = "~/github/MoTrPAC/MotrpacRatTraining6moQCRep"

# Runnable command for gsutil
gsutil_cmd = "~/bin/google-cloud-sdk/bin/gsutil"

# These paths were removed for security reasons, see the repo's readme
bucket = "ADDPATH"

# Where should the data be downloaded to
download_from_bucket = FALSE # If FALSE, it assumes the data is there
local_data_dir = "ADDPATH"

# where to print the data and dea tables
upload_to_bucket = FALSE # TRUE upload the data to the bucket
out_bucket = "ADDPATH"
dea_folder_name = "dea"
suffix = "20210914.txt"

# Specify bucket and local path for the phenotypic data
pheno_bucket = "ADDPATH"
pheno_local_dir = "ADDPATH"

# specify parameters for filtering by missing values
max_na_freq = 0.67
# Should PCA use imputed data?
pca_using_impute=FALSE

# Specify how many PCs to examine
num_pcs = 5
num_pcs_for_outlier_analysis = 3
# Specify the significance level for association analysis
# (e.g., between a PC and a clinical variable)
p_thr = 0.0001

Specify the pipeline, metadata, and sample variables for the analysis.

# Define technical and biological variables to be considered in the QC
pipeline_qc_cols = c("tmt_plex","num_NA")
biospec_cols = c("registration.sex",
                 "key.anirandgroup",
                 "registration.batchnumber",
                 "training.staffid",
                 "is_control",
                 "vo2.max.test.vo2_max_visit1", # this assumes that visit1's are aligned
                 "terminal.weight.mg",
                 "time_to_freeze",
                 "timepoint",
                 "bid",
                 "pid")
# load functions and libraries
source(paste0(repo_local_dir,"/tools/unsupervised_normalization_functions.R"))
source(paste0(repo_local_dir,"/tools/gcp_functions.R"))
source(paste0(repo_local_dir,"/tools/qc_report_aux_functions.R"))
source(paste0(repo_local_dir,"/tools/config_session.R"))
source(paste0(repo_local_dir,"/tools/association_analysis_methods.R"))
source(paste0(repo_local_dir,"/tools/ptm_protein_correction.R"))

1. Load data

1.1 Download data

if(download_from_bucket){
  obj = download_bucket_to_local_dir(bucket,
                                     local_path=path.expand(local_data_dir),
                                     remove_prev_files = TRUE,
                                     GSUTIL_PATH=gsutil_cmd)
}

1.2 Parse proteomics results

The code belows take the metadata and ration tables. For raw intensities (rii) - specify the matrix regex in “parse_proteomics_datasets_from_download_obj”.

obj = list(
  downloaded_files = list.files(local_data_dir,full.names = TRUE,recursive = TRUE),
  local_path = local_data_dir
)

proteomics_data = parse_proteomics_datasets_from_download_obj(
  obj,local_path=local_data_dir,remove_prev_files = TRUE,GSUTIL_PATH=gsutil_cmd,
  platforms = c("prot-ph","prot-pr","prot-ac", "prot-ub")
)
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t53-cortex,prot-ph"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t53-cortex/prot-ph/motrpac_pass1b-06_t53-cortex_prot-ph_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t53-cortex,prot-pr"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t53-cortex/prot-pr/motrpac_pass1b-06_t53-cortex_prot-pr_ratio-results.txt"
## [1] "Column names in the row annotation data:"
## [1] "protein_id"       "gene_symbol"      "entrez_id"        "redundant_ids"   
## [5] "organism_name"    "is_contaminant"   "num_peptides"     "percent_coverage"
## [9] "protein_score"   
## [1] "using protein id as feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t55-gastrocnemius,prot-ph"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t55-gastrocnemius/prot-ph/motrpac_pass1b-06_t55-gastrocnemius_prot-ph_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t55-gastrocnemius,prot-pr"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t55-gastrocnemius/prot-pr/motrpac_pass1b-06_t55-gastrocnemius_prot-pr_ratio-results.txt"
## [1] "Column names in the row annotation data:"
## [1] "protein_id"       "gene_symbol"      "entrez_id"        "redundant_ids"   
## [5] "organism_name"    "is_contaminant"   "num_peptides"     "percent_coverage"
## [9] "protein_score"   
## [1] "using protein id as feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t58-heart,prot-ph"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t58-heart/prot-ph/motrpac_pass1b-06_t58-heart_prot-ph_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t58-heart,prot-pr"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t58-heart/prot-pr/motrpac_pass1b-06_t58-heart_prot-pr_ratio-results.txt"
## [1] "Column names in the row annotation data:"
## [1] "protein_id"       "gene_symbol"      "entrez_id"        "redundant_ids"   
## [5] "organism_name"    "is_contaminant"   "num_peptides"     "percent_coverage"
## [9] "protein_score"   
## [1] "using protein id as feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t58-heart,prot-ac"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t58-heart/prot-ac/motrpac_pass1b-06_t58-heart_prot-ac_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t58-heart,prot-ub"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t58-heart/prot-ub/motrpac_pass1b-06_t58-heart_prot-ub_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t59-kidney,prot-ph"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t59-kidney/prot-ph/motrpac_pass1b-06_t59-kidney_prot-ph_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t59-kidney,prot-pr"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t59-kidney/prot-pr/motrpac_pass1b-06_t59-kidney_prot-pr_ratio-results.txt"
## [1] "Column names in the row annotation data:"
## [1] "protein_id"       "gene_symbol"      "entrez_id"        "redundant_ids"   
## [5] "organism_name"    "is_contaminant"   "num_peptides"     "percent_coverage"
## [9] "protein_score"   
## [1] "using protein id as feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t66-lung,prot-ph"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t66-lung/prot-ph/motrpac_pass1b-06_t66-lung_prot-ph_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t66-lung,prot-pr"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t66-lung/prot-pr/motrpac_pass1b-06_t66-lung_prot-pr_ratio-results.txt"
## [1] "Column names in the row annotation data:"
## [1] "protein_id"       "gene_symbol"      "entrez_id"        "redundant_ids"   
## [5] "organism_name"    "is_contaminant"   "num_peptides"     "percent_coverage"
## [9] "protein_score"   
## [1] "using protein id as feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t68-liver,prot-ph"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t68-liver/prot-ph/motrpac_pass1b-06_t68-liver_prot-ph_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t68-liver,prot-pr"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t68-liver/prot-pr/motrpac_pass1b-06_t68-liver_prot-pr_ratio-results.txt"
## [1] "Column names in the row annotation data:"
## [1] "protein_id"       "gene_symbol"      "entrez_id"        "redundant_ids"   
## [5] "organism_name"    "is_contaminant"   "num_peptides"     "percent_coverage"
## [9] "protein_score"   
## [1] "using protein id as feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t68-liver,prot-ac"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t68-liver/prot-ac/motrpac_pass1b-06_t68-liver_prot-ac_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t68-liver,prot-ub"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t68-liver/prot-ub/motrpac_pass1b-06_t68-liver_prot-ub_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t70-white-adipose,prot-ph"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t70-white-adipose/prot-ph/motrpac_pass1b-06_t70-white-adipose_prot-ph_ratio-results.txt"
## [1] "Column names in the row annotation data:"
##  [1] "protein_id"        "gene_symbol"       "entrez_id"        
##  [4] "redundant_ids"     "organism_name"     "is_contaminant"   
##  [7] "ptm_id"            "confident_score"   "confident_site"   
## [10] "flanking_sequence" "ptm_score"        
## [1] "using ptm_id as the feature name"
## [1] "--------------------------------------------------------------------"
## [1] "             analyzing dataset: t70-white-adipose,prot-pr"
## [1] "Using the annotation columns in the data file: motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/proteomics/t70-white-adipose/prot-pr/motrpac_pass1b-06_t70-white-adipose_prot-pr_ratio-results.txt"
## [1] "Column names in the row annotation data:"
## [1] "protein_id"       "gene_symbol"      "entrez_id"        "redundant_ids"   
## [5] "organism_name"    "is_contaminant"   "num_peptides"     "percent_coverage"
## [9] "protein_score"   
## [1] "using protein id as feature name"
failed_datasets = proteomics_data$failed_datasets
proteomics_data = proteomics_data$proteomics_parsed_datasets

#this is commented out: the printed buckets here will refer to the "ac" datasets,
# and these were not submitted for this release.
if(length(failed_datasets)>1){
  print("Assays/data not available (or failed):")
  local_fname = file.path(local_data_dir, paste0("log-pass1b-06_datasets-not-processed_",suffix))
  write.table(failed_datasets, local_fname, row.names = FALSE,quote=FALSE,sep="\t",col.names = FALSE)
}
## [1] "Assays/data not available (or failed):"

1.3 Download and parse phenotypic data

# Add the pheno data
pheno_data = parse_pheno_data(pheno_bucket,
                              local_path = path.expand(pheno_local_dir),
                              remove_prev_files = TRUE,
                              GSUTIL_PATH=gsutil_cmd)
# add a tissue variable (for convinience)
pheno_data$viallabel_data$tissue = 
  pheno_data$viallabel_data$specimen.processing.sampletypedescription
# add the time to freeze variable ((for convinience))
pheno_data$viallabel_data$time_to_freeze = 
  pheno_data$viallabel_data$calculated.variables.frozetime_after_train - 
  pheno_data$viallabel_data$calculated.variables.deathtime_after_train

# some freeze times are negative because the tissues were taken
# before sacrifice time, zero these cases
pheno_data$viallabel_data$time_to_freeze = pmax(0,pheno_data$viallabel_data$time_to_freeze)

# add a binary is_control variable
pheno_data$viallabel_data$is_control = as.numeric(grepl("control",
  pheno_data$viallabel_data$key.anirandgroup,ignore.case = T))
# add the timepoint as a number
# x - the text description of the group
get_numeric_timepoint<-function(x){
  v = rep(0,length(x))
  tps = c("Eight"=8,"Four"=4,"One"=1,"Two"=2)
  for(tp in names(tps)){
    v[grepl(paste0(tp,"-week"),x,perl = T,ignore.case = T)] = tps[tp]
  }
  return(v)
}
pheno_data$viallabel_data$timepoint = get_numeric_timepoint(
  pheno_data$viallabel_data$key.anirandgroup
)

2 Normalization and processing

2.1 Normalize and filter the data

  • Median mad normalization
  • Remove batch effect
  • Impute (for PCA)

Note: We use limma’s removeBatchEffect to account for plex variation. We also keep a version of the data with imputed missing values. The new data objects are saved into the proteomics_data list.

for(dataset_name in names(proteomics_data)){
  print(paste("processing",dataset_name))
  curr_data  = proteomics_data[[dataset_name]]$sample_data
  # median value normalization
  sample_meta = proteomics_data[[dataset_name]]$sample_meta[colnames(curr_data),]
  
  # Exclude features with many NAs
  row_NA_data = rowSums(is.na(curr_data))/ncol(curr_data)
  feature_inds = row_NA_data < max_na_freq
  # filter and normalize
  norm_data = as.matrix(curr_data[feature_inds,])
  norm_data = run_median_mad_norm(norm_data)
  # b for batch corrected
  norm_data_b = limma::removeBatchEffect(norm_data,sample_meta$tmt_plex)
  
  # Impute - use capture.output to avoid prints 
  # This will be used for PCA
  capture.output({
    norm_data_b_imp = impute::impute.knn(as.matrix(norm_data_b))$data
    norm_data_imp = impute::impute.knn(as.matrix(norm_data))$data
  },file=NULL)
  
  # Median normalization (per MOP)
  # norm_data_imp = run_median_mad_norm(norm_data_imp)
  if(any(is.na(norm_data_imp))){
    print(paste("Imputation failed in:",dataset_name))
  }
  
  proteomics_data[[dataset_name]][["norm_filtered"]] = list(
    norm_data_b = norm_data_b, # normalized and batch corrected
    norm_data_b_imp = norm_data_b_imp, # normalizes, batch corrected and imputed
    norm_data = norm_data, # normalized only, no batch correction, no imputation
    norm_data_imp = norm_data_imp, # normalized with imputation, no batch correction
    feature_inds = feature_inds
  )
  print(paste0('done, objects saved into proteomics_data[["',
               dataset_name,'"]][[,norm_filtered]]'))
}

2.2 Examine normalization effects

Examine the effect of the normalization process, which includes removing features with many NAs. For each dataset show the boxplots of three randomly selected plexes.

for(i in 1:length(proteomics_data)){
  unnorm_data = proteomics_data[[i]]$sample_data
  norm_data = proteomics_data[[i]][["norm_filtered"]][[1]]
  plex = proteomics_data[[i]]$sample_meta[colnames(norm_data),"tmt_plex"]
  plex = as.factor(plex)
  # order the data by the plexes
  ord = order(plex)
  unnorm_data = unnorm_data[,ord]
  norm_data = norm_data[,ord]
  plex = plex[ord]
  # take a subset of the plexes
  samp_plex = sample(unique(plex))[1:3]
  samp = which(plex %in% samp_plex)
  # plot
  curr_name = names(proteomics_data)[i]
  colnames(unnorm_data) = NULL;colnames(norm_data) = NULL
  boxplot(unnorm_data[,samp],
          main=paste0(curr_name,"\nraw data (",nrow(unnorm_data)," features)"),
          ylab = "log2 ratio",xaxt="n",col=plex)
  boxplot(norm_data[,samp],
          main=paste0(curr_name,"\nnorm data (ratios) (",nrow(norm_data)," features)"),
          ylab = "log2 ratio",xaxt="n",col=plex)
}

2.3 PTM Protein-abundance correction

Correct PTM datasets with protein abundances. Most relevant for ubiquitylome data.

for(ptm_assay in c("prot-ub","prot-ac","prot-ph")){
  
  #Select the PTM assay that has to be corrected
  ptm_data_names <- grep(ptm_assay,names(proteomics_data),value = T)
  tissues <- str_extract(ptm_data_names,"(\\w|-)+")
  
  #Loop through all tissues in this PTM assay and perform protein correction
  #Protein corrected dataset is saved under norm_filtered$norm_data_b_protein_corrected
  for(tissue in tissues){
    ptm_name <- paste(tissue,ptm_assay,sep=",")
    prot_name <- paste(tissue,"prot-pr",sep=",")
    cat("Protein correction - Processing dataset",paste(tissue,ptm_assay,sep=","),"...")
    
    #Identify matching protein id and save to row_annot in the PTM dataset
    proteomics_data[[ptm_name]]$row_annot <- 
      ptm_protein_match(pr_data = proteomics_data[[prot_name]]$norm_filtered$norm_data_b, 
                        ptm_row_annot =  proteomics_data[[ptm_name]]$row_annot, 
                        prot_row_annot = proteomics_data[[prot_name]]$row_annot)
    
    #Perform protein correction of PTM data
    proteomics_data[[ptm_name]]$norm_filtered$norm_data_b_protein_corrected <-
      ptm_protein_correction(ptm_data = proteomics_data[[ptm_name]]$norm_filtered$norm_data_b, 
                             pr_data = proteomics_data[[prot_name]]$norm_filtered$norm_data_b, 
                             ptm_row_annot =  proteomics_data[[ptm_name]]$row_annot, 
                             prot_row_annot = proteomics_data[[prot_name]]$row_annot, 
                             plot_graphs = T)
    
    #Remove sites with many missing values as indicated by feature_inds
    proteomics_data[[ptm_name]]$norm_filtered$norm_data_b_protein_corrected <-
      proteomics_data[[ptm_name]]$norm_filtered$norm_data_b_protein_corrected[
        proteomics_data[[ptm_name]]$norm_filtered$feature_inds,]
  }
  
}
## Protein correction - Processing dataset t58-heart,prot-ub ...[1] "508991 points with proteome match out of 536760 (94.8%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1175 -0.2021 -0.0101  0.1891  3.7684 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0061779  0.0006305   9.799   <2e-16 ***
## value.prot  0.2503471  0.0052074  48.075   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3578 on 322107 degrees of freedom
##   (214651 observations deleted due to missingness)
## Multiple R-squared:  0.007124,   Adjusted R-squared:  0.007121 
## F-statistic:  2311 on 1 and 322107 DF,  p-value: < 2.2e-16

## Protein correction - Processing dataset t68-liver,prot-ub ...[1] "717459 points with proteome match out of 749400 (95.7%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6426 -0.2045 -0.0010  0.2039  3.8543 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0037322  0.0006096   6.123 9.22e-10 ***
## value.prot  0.4603204  0.0017321 265.764  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3794 on 388692 degrees of freedom
##   (360706 observations deleted due to missingness)
## Multiple R-squared:  0.1538, Adjusted R-squared:  0.1538 
## F-statistic: 7.063e+04 on 1 and 388692 DF,  p-value: < 2.2e-16

## Protein correction - Processing dataset t58-heart,prot-ac ...[1] "330018 points with proteome match out of 345060 (95.6%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3194 -0.1441 -0.0130  0.1299  6.2150 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0037316  0.0007151   5.218 1.81e-07 ***
## value.prot  0.3775431  0.0050331  75.012  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3281 on 210856 degrees of freedom
##   (134202 observations deleted due to missingness)
## Multiple R-squared:  0.02599,    Adjusted R-squared:  0.02599 
## F-statistic:  5627 on 1 and 210856 DF,  p-value: < 2.2e-16

## Protein correction - Processing dataset t68-liver,prot-ac ...[1] "625382 points with proteome match out of 645354 (96.9%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.6010 -0.1683 -0.0005  0.1648  4.9365 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0008948  0.0005488  -1.631    0.103    
## value.prot   0.6439628  0.0020414 315.459   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3479 on 403020 degrees of freedom
##   (242332 observations deleted due to missingness)
## Multiple R-squared:  0.198,  Adjusted R-squared:  0.198 
## F-statistic: 9.951e+04 on 1 and 403020 DF,  p-value: < 2.2e-16

## Protein correction - Processing dataset t53-cortex,prot-ph ...[1] "3616301 points with proteome match out of 4069260 (88.9%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5722 -0.1199  0.0055  0.1278  6.4964 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0062784  0.0001909  -32.89   <2e-16 ***
## value.prot   0.1072188  0.0014144   75.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2515 on 1735626 degrees of freedom
##   (2333632 observations deleted due to missingness)
## Multiple R-squared:  0.0033, Adjusted R-squared:  0.003299 
## F-statistic:  5746 on 1 and 1735626 DF,  p-value: < 2.2e-16

## Protein correction - Processing dataset t55-gastrocnemius,prot-ph ...[1] "2097186 points with proteome match out of 2606340 (80.5%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4267 -0.2236  0.0067  0.2339  4.6641 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0148050  0.0004559  -32.48   <2e-16 ***
## value.prot   0.2157740  0.0017208  125.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4657 on 1048828 degrees of freedom
##   (1557510 observations deleted due to missingness)
## Multiple R-squared:  0.01477,    Adjusted R-squared:  0.01477 
## F-statistic: 1.572e+04 on 1 and 1048828 DF,  p-value: < 2.2e-16

## Protein correction - Processing dataset t58-heart,prot-ph ...[1] "2466530 points with proteome match out of 2906880 (84.9%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4843 -0.1565  0.0008  0.1580  5.3862 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.002969   0.000254  -11.69   <2e-16 ***
## value.prot   0.208000   0.001678  123.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3192 on 1580107 degrees of freedom
##   (1326771 observations deleted due to missingness)
## Multiple R-squared:  0.009635,   Adjusted R-squared:  0.009635 
## F-statistic: 1.537e+04 on 1 and 1580107 DF,  p-value: < 2.2e-16

## Protein correction - Processing dataset t59-kidney,prot-ph ...[1] "2415147 points with proteome match out of 2771340 (87.1%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5221 -0.1379  0.0024  0.1404  4.0082 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0039386  0.0002571  -15.32   <2e-16 ***
## value.prot   0.2094322  0.0018962  110.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2658 on 1069265 degrees of freedom
##   (1702073 observations deleted due to missingness)
## Multiple R-squared:  0.01128,    Adjusted R-squared:  0.01128 
## F-statistic: 1.22e+04 on 1 and 1069265 DF,  p-value: < 2.2e-16

## Protein correction - Processing dataset t66-lung,prot-ph ...[1] "3609313 points with proteome match out of 4024980 (89.7%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5629 -0.1655  0.0125  0.1805  3.8742 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0130596  0.0002311   -56.5   <2e-16 ***
## value.prot   0.1839376  0.0013530   135.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.315 on 1857487 degrees of freedom
##   (2167491 observations deleted due to missingness)
## Multiple R-squared:  0.009852,   Adjusted R-squared:  0.009851 
## F-statistic: 1.848e+04 on 1 and 1857487 DF,  p-value: < 2.2e-16

## Protein correction - Processing dataset t68-liver,prot-ph ...[1] "2726429 points with proteome match out of 3168240 (86.1%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2918 -0.1742  0.0063  0.1814  5.6259 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0051438  0.0002705  -19.02   <2e-16 ***
## value.prot   0.3876859  0.0012781  303.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3536 on 1708681 degrees of freedom
##   (1459557 observations deleted due to missingness)
## Multiple R-squared:  0.0511, Adjusted R-squared:  0.0511 
## F-statistic: 9.201e+04 on 1 and 1708681 DF,  p-value: < 2.2e-16

## Protein correction - Processing dataset t70-white-adipose,prot-ph ...[1] "2263871 points with proteome match out of 2644860 (85.6%)."
## [1] "Fitting model..."
## [1] "Success."
## 
## Call:
## lm(formula = value.ptm ~ value.prot, data = data, na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.6080  -0.2169   0.0114   0.2356   5.9893 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0034578  0.0004456  -7.759 8.55e-15 ***
## value.prot   0.6013227  0.0008119 740.671  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4597 on 1073525 degrees of freedom
##   (1571333 observations deleted due to missingness)
## Multiple R-squared:  0.3382, Adjusted R-squared:  0.3382 
## F-statistic: 5.486e+05 on 1 and 1073525 DF,  p-value: < 2.2e-16

3. Quality control

3.1 PCA before and after adjusting for batch

Run PCA analysis on the imputed data, with and without mitigating batch effects:

for(dataset_name in names(proteomics_data)){
  data_obj = proteomics_data[[dataset_name]]
  data_samples = colnames(data_obj[["norm_filtered"]][["norm_data_b"]])
  randgroup = pheno_data$viallabel_data[data_samples,"key.anirandgroup"]
  sex = pheno_data$viallabel_data[data_samples,"registration.sex"]
  sex[sex=="1"] = "F";sex[sex=="2"]="M"
  plex = data_obj$sample_meta[data_samples,"tmt_plex"]
  
  # pca after removing batch effects
  if(pca_using_impute){
    curr_data = data_obj[["norm_filtered"]][["norm_data_b_imp"]]
  }
  else{
    curr_data = data_obj[["norm_filtered"]][["norm_data_b"]]
    curr_data = curr_data[rowSums(is.na(curr_data))==0,]
  }
  curr_pca = prcomp(t(curr_data),scale. = T,center = T)
  curr_pcax = curr_pca$x[,1:num_pcs]
  explained_var = summary(curr_pca)[["importance"]][3,5]
  df = data.frame(curr_pcax,sex=sex,plex=plex,randgroup = randgroup)
  proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]] = list(
    pcax = df,explained_var=explained_var
  )
  
  # pca without removing batch effects
  if(pca_using_impute){
    curr_data = data_obj[["norm_filtered"]][["norm_data_imp"]]
  }
  else{
    curr_data = data_obj[["norm_filtered"]][["norm_data"]]
    curr_data = curr_data[rowSums(is.na(curr_data))==0,]
  }
  curr_pca = prcomp(t(curr_data),scale. = T,center = T)
  curr_pcax = curr_pca$x[,1:num_pcs]
  explained_var = summary(curr_pca)[["importance"]][3,5]
  df = data.frame(curr_pcax,sex=sex,plex=plex,randgroup = randgroup)
  proteomics_data[[dataset_name]][["norm_filtered"]][["pca_w_b"]] = list(
    pcax = df,explained_var=explained_var
  )
}

PCA plots:

for(dataset_name in names(proteomics_data)){
  df = proteomics_data[[dataset_name]][["norm_filtered"]][["pca_w_b"]]$pcax
  p = ggplot(df) + 
      geom_point(aes(x=PC1, y=PC2,col=plex, shape=sex)) +
      ggtitle(paste(dataset_name,"before batch effects analysis",sep=": "))
  plot(p)
  df = proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]]$pcax
  p = ggplot(df) + 
      geom_point(aes(x=PC1, y=PC2,col=plex, shape=sex)) +
      ggtitle(paste(dataset_name,"after batch effects analysis",sep=": "))
  plot(p)
}

3.2 PCA visualization of sex and group

We plot the top two PCs for each tissue, color and shape correspond to randomization group and sex.

for(dataset_name in names(proteomics_data)){
  df = proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]]$pcax
  p = ggplot(df) + 
      geom_point(aes(x=PC1, y=PC2,col=randgroup, shape=sex)) +
      ggtitle(dataset_name)
  plot(p)
}

3.3 Correlations with clinical variables

Here we analyze the top principal components (5) in each tissue and compute their association with the selected variables above (e.g., the pipeline qc metrics). We use Spearman correlation and a linear test for significance. For each dataset we also add the correlations among the metadata variables. These two analyses should be interpreted together, as the analyzed variables are not independent.

pcs_vs_qc_var_report = c()
metadata_variable_assoc_report = c()
for(dataset_name in names(proteomics_data)){
  # Get the projected PCs
  curr_pcax = proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]][[1]]
  curr_pcax = curr_pcax[,grepl("^PC",names(curr_pcax))]
  curr_pipeline_meta = proteomics_data[[dataset_name]]$sample_meta
  curr_data_samples = rownames(curr_pcax)
  curr_meta = pheno_data$viallabel_data[curr_data_samples,biospec_cols]
  # remove metadata variables with NAs
  curr_meta = curr_meta[,!apply(is.na(curr_meta),2,any)]
   # remove bid and pid
  curr_meta = curr_meta[,!grepl("pid|bid",names(curr_meta))]
  # Add batch
  curr_meta$plex = as.numeric(
    as.factor(curr_pipeline_meta[curr_data_samples,"tmt_plex"]))
  curr_meta_numeric_cols = sapply(curr_meta,is.numeric)
  # Remove zero sd columns
  curr_meta_numeric_cols[curr_meta_numeric_cols] =
    apply(curr_meta[,curr_meta_numeric_cols],2,sd)>0
  
  corrs = cor(curr_pcax,curr_meta[,curr_meta_numeric_cols],method="spearman")
  corrsp = pairwise_eval(
    curr_pcax,curr_meta[,curr_meta_numeric_cols],func=pairwise_association_wrapper,
    f=1,max_num_vals_for_discrete=8)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(t(corrs),lab=TRUE,lab_size=2.5,hc.order = F) +
    ggtitle(dataset_name) +
    theme(plot.title = element_text(hjust = 0.5,size=20)))
  
  for(i in 1:nrow(corrsp)){
    for(j in 1:ncol(corrsp)){
      if(corrsp[i,j]>p_thr){next}
      pcs_vs_qc_var_report = rbind(pcs_vs_qc_var_report,
            c(dataset_name,
              rownames(corrsp)[i],colnames(corrsp)[j],corrs[i,j],corrsp[i,j])
            )
    }
  }
  colnames(pcs_vs_qc_var_report) = c("Dataset(tissue,site)","PC",
                                  "qc_metric","rho(spearman)","p-value")
  
  ####
  # compute correlations among the metadata variables
  ####
  corrs = cor(curr_meta[,curr_meta_numeric_cols],method="spearman")
  corrsp = pairwise_eval(
    curr_meta[,curr_meta_numeric_cols],func=pairwise_association_wrapper,
    f=1,max_num_vals_for_discrete=8)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(corrs,lab=TRUE,lab_size=2.5,hc.order = T))
  
  for(n1 in rownames(corrsp)){
    for(n2 in rownames(corrsp)){
      if(n1==n2){break}
      if(n1 %in% biospec_cols &&
         n2 %in% biospec_cols) {next}
      if(corrsp[n1,n2]>p_thr){next}
      metadata_variable_assoc_report = rbind(metadata_variable_assoc_report,
            c(dataset_name,n1,n2,corrs[n1,n2],corrsp[n1,n2])
      )
    }
  }
  if(!is.null(dim(metadata_variable_assoc_report))){
    colnames(metadata_variable_assoc_report) = c(
      "Dataset(tissue,site)","Var1","Var2","rho(spearman)","p-value")
  }

}

# Format the reports - for a nicer presentation in a table
pcs_vs_qc_var_report[,5] = format(
  as.numeric(pcs_vs_qc_var_report[,5]),digits=3)
pcs_vs_qc_var_report[,4] = format(
  as.numeric(pcs_vs_qc_var_report[,4]),digits=3)
metadata_variable_assoc_report[,5] = format(
  as.numeric(metadata_variable_assoc_report[,5]),digits=3)
metadata_variable_assoc_report[,4] = format(
  as.numeric(metadata_variable_assoc_report[,4]),digits=3)

3.3.1 Summary of significant clinical variables

Table of clinical variables with significant correlations to the data

kable(pcs_vs_qc_var_report,longtable=TRUE,
      caption = "Correlations between PCs and other variables")  %>%
  kable_styling(latex_options = c("repeat_header"),font_size = 8)
Correlations between PCs and other variables
Dataset(tissue,site) PC qc_metric rho(spearman) p-value
t53-cortex,prot-ph PC1 time_to_freeze -0.582 2.98e-07
t55-gastrocnemius,prot-ph PC3 registration.sex 0.703 2.58e-09
t55-gastrocnemius,prot-ph PC3 training.staffid -0.551 1.86e-05
t55-gastrocnemius,prot-ph PC3 terminal.weight.mg 0.587 1.17e-07
t55-gastrocnemius,prot-pr PC1 registration.sex -0.656 2.76e-08
t55-gastrocnemius,prot-pr PC1 training.staffid 0.618 1.64e-07
t55-gastrocnemius,prot-pr PC1 terminal.weight.mg -0.501 4.60e-07
t55-gastrocnemius,prot-pr PC5 vo2.max.test.vo2_max_visit1 -0.426 3.67e-05
t58-heart,prot-ph PC1 registration.sex -0.853 6.44e-15
t58-heart,prot-ph PC1 training.staffid 0.606 2.15e-08
t58-heart,prot-ph PC1 terminal.weight.mg -0.748 1.80e-13
t58-heart,prot-pr PC1 registration.sex -0.866 1.29e-28
t58-heart,prot-pr PC1 training.staffid 0.520 6.68e-06
t58-heart,prot-pr PC1 vo2.max.test.vo2_max_visit1 0.559 2.92e-06
t58-heart,prot-pr PC1 terminal.weight.mg -0.739 7.38e-22
t58-heart,prot-ac PC1 registration.sex 0.598 2.00e-06
t58-heart,prot-ac PC1 terminal.weight.mg 0.579 9.29e-06
t58-heart,prot-ac PC2 registration.sex -0.562 9.30e-06
t58-heart,prot-ac PC2 terminal.weight.mg -0.483 4.53e-06
t58-heart,prot-ac PC3 is_control 0.524 5.15e-05
t58-heart,prot-ac PC5 is_control 0.544 3.48e-05
t58-heart,prot-ub PC1 registration.sex 0.676 1.18e-06
t58-heart,prot-ub PC1 terminal.weight.mg 0.558 7.50e-06
t58-heart,prot-ub PC3 is_control 0.505 3.22e-06
t59-kidney,prot-ph PC1 registration.sex -0.860 5.03e-26
t59-kidney,prot-ph PC1 vo2.max.test.vo2_max_visit1 0.594 2.89e-07
t59-kidney,prot-ph PC1 terminal.weight.mg -0.772 2.66e-22
t59-kidney,prot-pr PC1 registration.sex -0.866 1.21e-55
t59-kidney,prot-pr PC1 training.staffid 0.472 4.07e-05
t59-kidney,prot-pr PC1 vo2.max.test.vo2_max_visit1 0.553 4.73e-07
t59-kidney,prot-pr PC1 terminal.weight.mg -0.779 4.09e-31
t66-lung,prot-ph PC2 registration.sex 0.776 4.44e-12
t66-lung,prot-ph PC2 training.staffid -0.553 4.15e-07
t66-lung,prot-ph PC2 vo2.max.test.vo2_max_visit1 -0.468 8.37e-05
t66-lung,prot-ph PC2 terminal.weight.mg 0.654 4.05e-10
t66-lung,prot-pr PC2 registration.sex -0.683 6.66e-08
t66-lung,prot-pr PC2 terminal.weight.mg -0.611 1.05e-07
t66-lung,prot-pr PC4 registration.sex -0.601 8.39e-07
t66-lung,prot-pr PC4 training.staffid 0.575 4.72e-07
t66-lung,prot-pr PC4 terminal.weight.mg -0.494 7.34e-06
t68-liver,prot-ph PC1 registration.sex -0.864 4.26e-31
t68-liver,prot-ph PC1 training.staffid 0.565 2.57e-05
t68-liver,prot-ph PC1 vo2.max.test.vo2_max_visit1 0.575 1.76e-07
t68-liver,prot-ph PC1 terminal.weight.mg -0.714 3.91e-22
t68-liver,prot-pr PC1 registration.sex -0.864 1.58e-34
t68-liver,prot-pr PC1 vo2.max.test.vo2_max_visit1 0.588 1.57e-07
t68-liver,prot-pr PC1 terminal.weight.mg -0.735 8.31e-24
t68-liver,prot-ac PC1 registration.sex -0.861 2.37e-25
t68-liver,prot-ac PC1 training.staffid 0.583 3.77e-05
t68-liver,prot-ac PC1 vo2.max.test.vo2_max_visit1 0.540 8.38e-06
t68-liver,prot-ac PC1 terminal.weight.mg -0.677 1.21e-18
t68-liver,prot-ac PC3 training.staffid -0.512 4.85e-05
t68-liver,prot-ub PC1 registration.sex -0.862 8.08e-24
t68-liver,prot-ub PC1 training.staffid 0.599 9.65e-07
t68-liver,prot-ub PC1 vo2.max.test.vo2_max_visit1 0.602 1.81e-07
t68-liver,prot-ub PC1 terminal.weight.mg -0.733 3.59e-19
t70-white-adipose,prot-ph PC1 registration.sex -0.866 1.61e-34
t70-white-adipose,prot-ph PC1 training.staffid 0.476 5.32e-05
t70-white-adipose,prot-ph PC1 vo2.max.test.vo2_max_visit1 0.609 1.62e-07
t70-white-adipose,prot-ph PC1 terminal.weight.mg -0.754 8.79e-25
t70-white-adipose,prot-pr PC1 registration.sex -0.866 2.54e-34
t70-white-adipose,prot-pr PC1 training.staffid 0.539 3.19e-05
t70-white-adipose,prot-pr PC1 vo2.max.test.vo2_max_visit1 0.610 9.01e-08
t70-white-adipose,prot-pr PC1 terminal.weight.mg -0.763 1.90e-24
t70-white-adipose,prot-pr PC5 is_control -0.599 3.04e-07
kable(metadata_variable_assoc_report,longtable=TRUE,
      caption = "Correlations between metadata variables")%>%
  kable_styling(latex_options = c("repeat_header"),font_size = 8)
Correlations between metadata variables

3.4 PCA outliers

In this analysis, outliers are flagged by examining the boxplot of each PC, extending its whiskers to three times the inter-quantile range away from the boxplot. Samples outside this range are then flagged.

Note that samples are flagged using an automatic analysis of the principal components. Such analyses may flag samples because of true biological effects and thus further examination is required before determining if flagged samples represent low quality samples. Moreover, note that we plot outliers for the rii (raw intensitied) data as well. Typically, outliers from such datasets will not appear in the bettern normalized ratio data.

pca_outliers_report = c()
for(dataset_name in names(proteomics_data)){
  # Get the projected PCs
  curr_pcax = proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]][[1]]  
  # Univariate: use IQRs
  pca_outliers = c()
  for(j in 1:num_pcs_for_outlier_analysis){
    
    #Identify outlier values
    outlier_values <- boxplot.stats(curr_pcax[,j],coef = 3)$out
    outlier_id <- curr_pcax[,j] %in% outlier_values
    
    #Extract outlier names
    outlier_values <- curr_pcax[outlier_id,j]
    names(outlier_values) <- rownames(curr_pcax)[outlier_id]
    
    for(outlier in names(outlier_values)){
      pca_outliers_report = rbind(pca_outliers_report,
       c(dataset_name,paste("PC",j,sep=""),outlier,
         format(outlier_values[outlier],digits=5))
      )
      if(!is.element(outlier,names(pca_outliers))){
        pca_outliers[outlier] = outlier_values[outlier]
      }
    }
  }
  
  # Plot the outliers
  if(length(pca_outliers)>0){
    df = data.frame(curr_pcax,
                outliers = rownames(curr_pcax) %in% names(pca_outliers))
    col = rep("black",nrow(df))
    col[df$outliers] = "green"
    plot(df$PC1,df$PC2,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(dataset_name,"flagged outliers"),
         xlab = "PC1",ylab="PC2")
    plot(df$PC3,df$PC4,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(dataset_name,"flagged outliers"),
         xlab = "PC3",ylab="PC4")
  }
}

if(!is.null(dim(pca_outliers_report))){
  colnames(pca_outliers_report) =  c("dataset","PC","sample","score")
}

3.5 Sex prediction accuracy

Here we map features to their chromosomes and then use the X and Y chromosomes to train a classifier for predicting sex. Note that unlike the sequencing platforms we use the molecular features from the data, which requires running a regularized learning algorithm. We use linear SVM as it seems to produce reasonable results.

entrez2chr = as.list(org.Rn.egCHR)
symbol2entrez = as.list(org.Rn.egSYMBOL2EG)
sex_pred_err_rates = c()
sex_check_outliers = c()
for(dataset_name in names(proteomics_data)){
  curr_data = proteomics_data[[dataset_name]][["norm_filtered"]][[2]]
  curr_annot = proteomics_data[[dataset_name]]$row_annot
  # get the correct row annotation (was filtered by NAs above)
  feature_inds = proteomics_data[[dataset_name]][["norm_filtered"]]$feature_inds
  curr_annot = curr_annot[feature_inds,]
  # map to entrez ids
  curr_entrez = symbol2entrez[curr_annot[,"gene_symbol"]]
  # take rows with a unique entrez id
  single_entrez  = sapply(curr_entrez,length)==1
  curr_data = curr_data[single_entrez,]
  curr_annot = curr_annot[single_entrez,]
  curr_entrez = unlist(curr_entrez[single_entrez])
  # map to chromosomes
  curr_chrs = entrez2chr[curr_entrez]
  # limit the data to X and Y chromosomes
  chrr_y_genes = sapply(curr_chrs,function(x)any(x=="Y" | x=="X"))
  chrr_y_genes[is.na(chrr_y_genes)] = F
  if(sum(chrr_y_genes,na.rm=TRUE)<2){next}
  sex = pheno_data$viallabel_data[colnames(curr_data),"registration.sex"]
  df = data.frame(sex=as.factor(sex),t(curr_data[chrr_y_genes,]))
  df = df[,colSums(is.na(df))==0]
  # run an svm classifier
  train_control <- caret::trainControl(method = "cv", number = nrow(df),
                                savePredictions = TRUE)
  # train the model on training set
  model <- caret::train(sex ~ .,data = df,
               trControl = train_control,method = "svmLinear2")
  # loocv results
  cv_res = model$pred
  cv_res = cv_res[cv_res[,"cost"]==0.5,]
  cv_errors = cv_res[,1] != cv_res[,2]
  acc = 1 - (sum(cv_errors)/nrow(cv_res))
  print(paste(
    "In dataset",dataset_name,
    "sex prediction using X,Y chromosomes, cross-validation accuracy is:",
    round(acc,digits = 3)))
  sex_pred_err_rates = rbind(sex_pred_err_rates,
      c(dataset_name,mean(cv_errors))
  )
  err_samples = rownames(df)[cv_errors]
  for(samp in err_samples){
    sex_check_outliers = rbind(
      sex_check_outliers,c(dataset_name,samp))
  }
}
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t53-cortex,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.8"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t53-cortex,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t55-gastrocnemius,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.9"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t55-gastrocnemius,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.983"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t58-heart,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t58-heart,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t58-heart,prot-ac sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.926"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t58-heart,prot-ub sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.933"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t59-kidney,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t59-kidney,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t66-lung,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.967"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t66-lung,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t68-liver,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.983"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t68-liver,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.983"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t68-liver,prot-ac sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.981"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t68-liver,prot-ub sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t70-white-adipose,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.983"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t70-white-adipose,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
if(! is.null(dim(sex_check_outliers))){
  colnames(sex_check_outliers) = c("dataset","sample")
}

3.6 Summary of PCA and sex prediction outliers

sample2dataset = c()
for(dataset_name in names(proteomics_data)){
  sample2dataset[colnames(proteomics_data[[dataset_name]]$sample_data)] = dataset_name
}
# add the dataset name to mop outliers
all_flagged = NULL
if(length(pca_outliers_report)>0){
  all_flagged = union(all_flagged,pca_outliers_report[,"sample"])
}
if(length(sex_check_outliers)>0){
  all_flagged = union(all_flagged,sex_check_outliers[,2])
}

flagged_sample_report = c()
for(samp in all_flagged){
  samp_dataset = sample2dataset[samp]
  samp_metric = NULL
  if(!is.null(dim(sex_check_outliers)) &&
     samp %in% sex_check_outliers[,"sample"]){
    samp_metric = c(samp_metric,"Sex-flagged")
  }
  if(!is.null(dim(pca_outliers_report)) &&
     samp %in% pca_outliers_report[,"sample"]){
    curr_pcs = pca_outliers_report[pca_outliers_report[,"sample"]==samp,"PC"]
    samp_metric = c(samp_metric,curr_pcs)
  }
  flagged_sample_report = rbind(flagged_sample_report,
      c(samp,samp_dataset,paste(samp_metric,collapse=","))
  )
}

colnames(flagged_sample_report) = c("Viallabel","Dataset","Methods")
kable(flagged_sample_report,longtable=TRUE,
      caption = "Outliers detected by tissue PCA data")%>%
  kable_styling(font_size = 8,latex_options = c("hold_position", "repeat_header"))
Outliers detected by tissue PCA data
Viallabel Dataset Methods
90225015305 t53-cortex,prot-pr Sex-flagged,PC2,PC3,PC2
90245015305 t53-cortex,prot-pr PC2,PC3
90421015305 t53-cortex,prot-pr PC2,PC3,PC2
90223015507 t55-gastrocnemius,prot-pr Sex-flagged,PC2
90237015805 t58-heart,prot-ub PC3
90245015805 t58-heart,prot-ub PC3
90432015802 t58-heart,prot-ub PC3
90444015805 t58-heart,prot-ub Sex-flagged,PC3
90559015805 t58-heart,prot-ub Sex-flagged,PC2
90283015305 t53-cortex,prot-pr Sex-flagged
90406015305 t53-cortex,prot-pr Sex-flagged
90280015305 t53-cortex,prot-pr Sex-flagged
90581015305 t53-cortex,prot-pr Sex-flagged
90410015305 t53-cortex,prot-pr Sex-flagged
90222015305 t53-cortex,prot-pr Sex-flagged
90248015305 t53-cortex,prot-pr Sex-flagged
90258015305 t53-cortex,prot-pr Sex-flagged
90559015305 t53-cortex,prot-pr Sex-flagged
90564015305 t53-cortex,prot-pr Sex-flagged
90439015305 t53-cortex,prot-pr Sex-flagged
90406015507 t55-gastrocnemius,prot-pr Sex-flagged
90231015503 t55-gastrocnemius,prot-pr Sex-flagged
90259015507 t55-gastrocnemius,prot-pr Sex-flagged
90283015507 t55-gastrocnemius,prot-pr Sex-flagged
90252015507 t55-gastrocnemius,prot-pr Sex-flagged
90258015805 t58-heart,prot-ub Sex-flagged
90239015805 t58-heart,prot-ub Sex-flagged
90289015805 t58-heart,prot-ub Sex-flagged
90252015805 t58-heart,prot-ub Sex-flagged
90234015802 t58-heart,prot-ub Sex-flagged
90217016609 t66-lung,prot-pr Sex-flagged
90255016609 t66-lung,prot-pr Sex-flagged
90560016806 t68-liver,prot-ub Sex-flagged
90430017006 t70-white-adipose,prot-pr Sex-flagged

4. Remove outlier samples

Specify samples that will be removed from differential enrichment results and from the normalized outputs produced by this notebook.

#Identify samples that need to be removed as a named list for each ome
VIALS_TO_REMOVE <- list(
  #Samples removed from global proteome data
  "t68-liver,prot-pr" = c("90560016806"),
  "t55-gastrocnemius,prot-pr" = c("90252015507","90245015507","90223015507"),
  
  #Samples removed from global phosphoproteome data
  "t68-liver,prot-ph" = c("90560016806"),
  "t55-gastrocnemius,prot-ph" = c("90252015507","90245015507","90223015507"),
  
  #Samples removed from global acetylome data
  "t68-liver,prot-ac" = c("90560016806"),
  
  #Samples removed from global ubiquitylome data
  "t58-heart,prot-ub" = c("90559015805")
)

This code removes the outlier samples from downstream differential analysis and output tables

#Loop through each dataset to remove outlier samples
for(dataset in names(VIALS_TO_REMOVE)){
  #Remove sample from the original sample matrix
  proteomics_data[[dataset]]$sample_data <-
    proteomics_data[[dataset]]$sample_data %>% 
    dplyr::select(-all_of(VIALS_TO_REMOVE[[dataset]]))
  
  #Remove samples from the metadata matrix
  proteomics_data[[dataset]]$sample_meta <-
    proteomics_data[[dataset]]$sample_meta %>% 
    filter(!(vial_label %in% VIALS_TO_REMOVE[[dataset]]))
  
  # Loop through the various normalized datasets and remove the outlier sample
  for(i in names(proteomics_data[[dataset]]$norm_filtered)){
    if (!(i %in% c("feature_inds","pca_b","pca_w_b"))){
      to_keep <- !(colnames(proteomics_data[[dataset]]$norm_filtered[[i]]) %in% VIALS_TO_REMOVE[[dataset]])
      
      proteomics_data[[dataset]]$norm_filtered[[i]] <-
        proteomics_data[[dataset]]$norm_filtered[[i]][,to_keep]
    }
  }
}

5. Remove contaminant proteins

This chunk remove contaminant proteins routinely detected during MS analysis. Contaminants are removed at this step and not earlier as they may be useful for the PTM-Protein correction step and for quality control

for(dataset_name in names(proteomics_data)){
  #Identify contaminant features
  contaminant_ids <- proteomics_data[[dataset_name]]$row_annot %>%
    as.data.frame %>% filter(is_contaminant == TRUE) %>% rownames
  
  #Update feature index
  proteomics_data[[dataset_name]]$norm_filtered$feature_inds[contaminant_ids] <- F 
  
  # Loop through the various normalized datasets and remove contaminant features
  for(i in names(proteomics_data[[dataset]]$norm_filtered)){
    if (!(i %in% c("feature_inds","pca_b","pca_w_b"))){
      to_keep <- setdiff(rownames(proteomics_data[[dataset_name]]$norm_filtered[[i]]),
                                          contaminant_ids)
      proteomics_data[[dataset_name]]$norm_filtered[[i]] <-
        proteomics_data[[dataset_name]]$norm_filtered[[i]][to_keep,]
    }
  }
}

6. Differential analysis

Differential analysis is performed using two approaches. Section 6.1 uses a model that combines both animal sexes, while section 6.2 fits a model on samples from each sex. Only results from section 6.2 are saved and uploaded to the Google bucket.

Future implementations can modify Section 6.1 to account for sex-specific residuals. For example, by using generalized least squares.

6.1 DEA without splitting sexes

Here, we use a combined model for both sexes.

First, perform DEA on non-corrected data

# a helper function for getting SEs per estimate in the timewise
# data analysis
limma_res_extract_se<-function(limma_res,e_fit,
                               effect_col="logFC",t_col="t",colname=NULL){
  # method 1
  effects = limma_res[[effect_col]]
  ts = limma_res[[t_col]]
  ses1 = effects/ts
  if(is.null(colname)){
    return(ses1)
  }
  # method 2
  ses2 = sqrt(e_fit$s2.post) * e_fit$stdev.unscaled
  ts1 = effects/ses1
  ts2 = effects/ses2[,colname]
  if(max(abs(ts-ts1),na.rm = T) > 1e-10 || 
     max(abs(ts-ts2),na.rm = T) > 1e-10 || 
     max(abs(ts1-ts2),na.rm = T)>1e-10){
    print("Warning: t-stats from computed ses are incompatible with limma's output")
  }
  return(ses2[,colname])
}

ftest_res = c() # keeps all ftest results
sex_ftest_res = c() # keeps the sex-specific f-tests
dataset2num_discoveries = c() # count stats
tpDA = c() # keep the timewise results

for(dataset_name in names(proteomics_data)){
  
  
  #Extract current dataset
  x = proteomics_data[[dataset_name]][["norm_filtered"]][[1]]
  
  #Extract sample annotations
  x_annot = proteomics_data[[dataset_name]]$row_annot
  feature_inds = proteomics_data[[dataset_name]][["norm_filtered"]]$feature_inds
  x_annot = x_annot[feature_inds,]
    covs = data.frame(
    sex = pheno_data$viallabel_data[colnames(x),"registration.sex"],
    timepoint = pheno_data$viallabel_data[colnames(x),"timepoint"],
    is_control = pheno_data$viallabel_data[colnames(x),"is_control"]
  )
  covs$sex[covs$sex=="2"] = "M"; covs$sex[covs$sex=="1"]="F"
  rownames(covs) = colnames(x)
  
  #Specify the model
  full_model_str = "~1+sex+group+sex:group"
  
  
  
  # F-test analysis - training-dea table
  # Fit a single limma model for F-test.
  # Perform F-test using the exercise groups and group:sex interactions
  # see https://support.bioconductor.org/p/12119/ and the limma guide
  group=factor(paste(covs$timepoint,covs$is_control,sep="_"),
               levels=c("8_1","1_0","2_0", "4_0", "8_0"))
  covs$group=group
  des = model.matrix(as.formula(full_model_str),data=covs)
  is_group_variable = grepl("group",colnames(des))
  colnames(des) = gsub("group","",colnames(des))
  colnames(des)[1] = "Intercept"

  # fit the models
  limma_model1 = lmFit(x,as.matrix(des))
  eb_Ftest = eBayes(limma_model1)
  
  # Extract the results
  res <- topTable(eb_Ftest,coef = which(is_group_variable),
                  number = Inf, sort.by = "none")
  
  num_sig1 = sum(p.adjust(res$P.Value,method="fdr")<0.1,na.rm=TRUE)
    
  curr_f_test_res = data.frame(
      feature_ID = rownames(x),
      assay = strsplit(dataset_name,split=",")[[1]][2],
      tissue = strsplit(dataset_name,split=",")[[1]][1],
      p_value = res$P.Value,
      adj_p_value = p.adjust(res$P.Value,method="fdr"),
      fscore = res$`F`,
      numNAs = rowSums(is.na(x)),
      full_model = full_model_str,
      reduced_model = "~1+sex"
  )
  
  ftest_res = rbind(ftest_res,curr_f_test_res)
  
  # F-test analysis - training-sex-biased table
  # Perform F-test using the group:sex interactions
  inter_terms = grepl(":",colnames(des))
  res_sex <- topTable(eb_Ftest,coef = which(inter_terms),
                      number = Inf, sort.by = "none")
  num_sig_sex = sum(p.adjust(res_sex$P.Value,method="fdr")<0.1,na.rm=TRUE)
  curr_sex_f_test_res = data.frame(
    feature_ID = rownames(x),
    assay = strsplit(dataset_name,split=",")[[1]][2],
    tissue = strsplit(dataset_name,split=",")[[1]][1],
    p_value = res_sex$P.Value,
    adj_p_value = p.adjust(res_sex$P.Value,method="fdr"),
    fscore = res_sex$`F`,
    numNAs = rowSums(is.na(x)),
    full_model = "~1+sex+group+sex:group",
    reduced_model = "~1+sex+group"
  )
  sex_ftest_res = rbind(sex_ftest_res,curr_sex_f_test_res)
    
  
  
  
  
  # T-test analysis - timewise-dea table
  # Perform T-test using the exercise groups and group:sex interactions
  group=factor(paste(covs$sex,covs$timepoint,covs$is_control,sep="_"))
  des = model.matrix(~0+group)
  colnames(des) = gsub("group","",colnames(des))
  C_ttests = makeContrasts(
      M_1_0 - M_8_1,M_2_0 - M_8_1,M_4_0 - M_8_1,M_8_0 - M_8_1,
      F_1_0 - F_8_1,F_2_0 - F_8_1,F_4_0 - F_8_1,F_8_0 - F_8_1,
      levels = des
  )

  # fit the new model
  limma_model2 = lmFit(x,as.matrix(des))
  lmfit.cont <- contrasts.fit(limma_model2, C_ttests)
  lmfit.cont.ebayes <- eBayes(lmfit.cont)
  
  # count the results at 10% FDR
  t_ps = c(lmfit.cont.ebayes$p.value)
  suppressWarnings({pthr = max(-1,max(t_ps[p.adjust(t_ps,method="BY")<0.1]))})
  num_sig2 = sum(apply(lmfit.cont.ebayes$p.value<pthr,1,any),na.rm = T)
  suppressWarnings({pthr = max(-1,max(t_ps[p.adjust(t_ps,method="fdr")<0.1]))})
  num_sig3 = sum(apply(lmfit.cont.ebayes$p.value<pthr,1,any),na.rm = T)
  v_counts = c(dataset=dataset_name,ftest=num_sig1,
               sex_ftest=num_sig_sex,ttests_BY=num_sig2,ttests_BH=num_sig3)
  dataset2num_discoveries = rbind(dataset2num_discoveries,v_counts)

  # extract contrast info
  for(col in colnames(lmfit.cont.ebayes$t)){
    curr_sex = strsplit(col,split="_")[[1]][1]
    sex_str = "male"
    if(curr_sex=="F"){sex_str="female"}
    curr_tp = strsplit(col,split="_")[[1]][2]
    samps = rownames(covs)[covs$sex==curr_sex &
                 (covs$timepoint==curr_tp | covs$is_control==1)]
    if(any(table(covs[samps,"is_case"])<2)){
        print(paste("in dataset:",dataset,"skipping time point",col,"of sex",sex))
        print(paste("reason: there is a group with less than 2 samples"))
        next
    }
    limma_res = topTable(lmfit.cont.ebayes,
                    coef = col,number = nrow(x),sort.by = "none")
    
    curr_res = data.frame(
      feature_ID = rownames(x),
      assay = strsplit(dataset_name,split=",")[[1]][2],
      tissue = strsplit(dataset_name,split=",")[[1]][1],
      sex = sex_str,
      logFC_se = limma_res_extract_se(limma_res,lmfit.cont.ebayes),
      logFC = limma_res$logFC,
      tscore = limma_res$t,
      covariates = NA,
      comparison_group = paste0(curr_tp,"w"),
      p_value = limma_res$P.Value,
      adj_p_value = limma_res$adj.P.Val
    )
    # add group info
    case_samps = samps[covs[samps,"is_case"]=="1"]
    curr_res$comparison_average_intensity = apply(x[,case_samps],1,mean)
    control_samps = samps[covs[samps,"is_case"]!="1"]
    curr_res$reference_average_intensity = apply(x[,control_samps],1,mean)
    # add NA counts
    curr_res$numNAs = rowSums(is.na(x[,samps]))
    # add the results
    tpDA = rbind(tpDA,curr_res)
  }
}

Perform DEA on PTM protein-corrected data.

#Perform DEA only on PTM data
ptm_datasets <- grep("prot-pr",names(proteomics_data),value = T,invert = T)

for(dataset_name in ptm_datasets){
  x = proteomics_data[[dataset_name]][["norm_filtered"]][["norm_data_b_protein_corrected"]]
  #Extract sample annotations
  x_annot = proteomics_data[[dataset_name]]$row_annot
  feature_inds = proteomics_data[[dataset_name]][["norm_filtered"]]$feature_inds
  x_annot = x_annot[feature_inds,]
    covs = data.frame(
    sex = pheno_data$viallabel_data[colnames(x),"registration.sex"],
    timepoint = pheno_data$viallabel_data[colnames(x),"timepoint"],
    is_control = pheno_data$viallabel_data[colnames(x),"is_control"]
  )
  covs$sex[covs$sex=="2"] = "M";covs$sex[covs$sex=="1"]="F"
  rownames(covs) = colnames(x)
  
  #Specify the model
  full_model_str = "~1+sex+group+sex:group"
  
  # F-test analysis - training-dea table
  # Fit a single limma model for F-test.
  # Perform F-test using the exercise groups and group:sex interactions
  # see https://support.bioconductor.org/p/12119/ and the limma guide
  group=factor(paste(covs$timepoint,covs$is_control,sep="_"),
               levels=c("8_1","1_0","2_0", "4_0", "8_0"))
  covs$group=group
  des = model.matrix(as.formula(full_model_str),data=covs)
  is_group_variable = grepl("group",colnames(des))
  colnames(des) = gsub("group","",colnames(des))
  colnames(des)[1] = "Intercept"

  # fit the models
  limma_model1 = lmFit(x,as.matrix(des))
  eb_Ftest = eBayes(limma_model1)
  
  # Extract the results
  res <- topTable(eb_Ftest,coef = which(is_group_variable),
                  number = Inf, sort.by = "none")
  
  num_sig1 = sum(p.adjust(res$P.Value,method="fdr")<0.1,na.rm=TRUE)
    
  curr_f_test_res = data.frame(
      feature_ID = rownames(x),
      assay = paste0(strsplit(dataset_name,split=",")[[1]][2],"-protein-corrected"),
      tissue = strsplit(dataset_name,split=",")[[1]][1],
      p_value = res$P.Value,
      adj_p_value = p.adjust(res$P.Value,method="fdr"),
      fscore = res$`F`,
      numNAs = rowSums(is.na(x)),
      full_model = full_model_str,
      reduced_model = "~1+sex"
  )
  
  ftest_res = rbind(ftest_res,curr_f_test_res)

  
  # F-test analysis - training-sex-biased table
  # Perform F-test using the group:sex interactions
  inter_terms = grepl(":",colnames(des))
  res_sex <- topTable(eb_Ftest,coef = which(inter_terms),
                      number = Inf, sort.by = "none")
  num_sig_sex = sum(p.adjust(res_sex$P.Value,method="fdr")<0.1,na.rm=TRUE)
  curr_sex_f_test_res = data.frame(
    feature_ID = rownames(x),
    assay = paste0(strsplit(dataset_name,split=",")[[1]][2],"-protein-corrected"),
    tissue = strsplit(dataset_name,split=",")[[1]][1],
    p_value = res_sex$P.Value,
    adj_p_value = p.adjust(res_sex$P.Value,method="fdr"),
    fscore = res_sex$`F`,
    numNAs = rowSums(is.na(x)),
    full_model = "~1+sex+group+sex:group",
    reduced_model = "~1+sex+group"
  )
  sex_ftest_res = rbind(sex_ftest_res,curr_sex_f_test_res)
    
  
  # T-test analysis - timewise-dea table
  # Perform T-test using the exercise groups and group:sex interactions
  group=factor(paste(covs$sex,covs$timepoint,covs$is_control,sep="_"))
  des = model.matrix(~0+group)
  colnames(des) = gsub("group","",colnames(des))
  C_ttests = makeContrasts(
      M_1_0 - M_8_1,M_2_0 - M_8_1,M_4_0 - M_8_1,M_8_0 - M_8_1,
      F_1_0 - F_8_1,F_2_0 - F_8_1,F_4_0 - F_8_1,F_8_0 - F_8_1,
      levels = des
  )

  # fit the new model
  limma_model2 = lmFit(x,as.matrix(des))
  lmfit.cont <- contrasts.fit(limma_model2, C_ttests)
  lmfit.cont.ebayes <- eBayes(lmfit.cont)
  
  # count the results at 10% FDR
  t_ps = c(lmfit.cont.ebayes$p.value)
  suppressWarnings({pthr = max(-1,max(t_ps[p.adjust(t_ps,method="BY")<0.1]))})
  num_sig2 = sum(apply(lmfit.cont.ebayes$p.value<pthr,1,any),na.rm = T)
  suppressWarnings({pthr = max(-1,max(t_ps[p.adjust(t_ps,method="fdr")<0.1]))})
  num_sig3 = sum(apply(lmfit.cont.ebayes$p.value<pthr,1,any),na.rm = T)
  v_counts = c(dataset=dataset_name,ftest=num_sig1,
               sex_ftest=num_sig_sex,ttests_BY=num_sig2,ttests_BH=num_sig3)
  dataset2num_discoveries = rbind(dataset2num_discoveries,v_counts)

  # extract contrast info
  for(col in colnames(lmfit.cont.ebayes$t)){
    curr_sex = strsplit(col,split="_")[[1]][1]
    sex_str = "male"
    if(curr_sex=="F"){sex_str="female"}
    curr_tp = strsplit(col,split="_")[[1]][2]
    samps = rownames(covs)[covs$sex==curr_sex &
                 (covs$timepoint==curr_tp | covs$is_control==1)]
    if(any(table(covs[samps,"is_case"])<2)){
        print(paste("in dataset:",dataset,"skipping time point",col,"of sex",sex))
        print(paste("reason: there is a group with less than 2 samples"))
        next
    }
    limma_res = topTable(lmfit.cont.ebayes,
                    coef = col,number = nrow(x),sort.by = "none")
    
    curr_res = data.frame(
      feature_ID = rownames(x),
      assay = paste0(strsplit(dataset_name,split=",")[[1]][2],"-protein-corrected"),
      tissue = strsplit(dataset_name,split=",")[[1]][1],
      sex = sex_str,
      logFC_se = limma_res_extract_se(limma_res,lmfit.cont.ebayes),
      logFC = limma_res$logFC,
      tscore = limma_res$t,
      covariates = NA,
      comparison_group = paste0(curr_tp,"w"),
      p_value = limma_res$P.Value,
      adj_p_value = limma_res$adj.P.Val
    )
    # add group info
    case_samps = samps[covs[samps,"is_case"]=="1"]
    curr_res$comparison_average_intensity = apply(x[,case_samps],1,mean,na.rm=TRUE)
    control_samps = samps[covs[samps,"is_case"]!="1"]
    curr_res$reference_average_intensity = apply(x[,control_samps],1,mean,na.rm=TRUE)
    # add NA counts
    curr_res$numNAs = rowSums(is.na(x[,samps]))
    # add the results
    tpDA = rbind(tpDA,curr_res)
  }
}

6.2 DEA separating sexes

Here, we perform DEA by fitting models to only Male or only Female samples.

First run on non-corrected data.

ftest_res_split_sex = c() # keeps all ftest results
tpDA_split_sex = c() # keep the timewise results

for(dataset_name in names(proteomics_data)){
  
  #Extract current dataset and metadata
  x = proteomics_data[[dataset_name]][["norm_filtered"]][[1]]
  x_annot = proteomics_data[[dataset_name]]$row_annot
  feature_inds = proteomics_data[[dataset_name]][["norm_filtered"]]$feature_inds
  x_annot = x_annot[feature_inds,]
  
  #Specify covariates
  covs = data.frame(
    sex = pheno_data$viallabel_data[colnames(x),"registration.sex"],
    timepoint = pheno_data$viallabel_data[colnames(x),"timepoint"],
    is_control = pheno_data$viallabel_data[colnames(x),"is_control"]
  )
  covs$sex[covs$sex=="2"] = "M";covs$sex[covs$sex=="1"]="F"
  rownames(covs) = colnames(x)
  
  #Variables that save sex-specific results
  sex_res = list() #F-test result
  sex_ttest_res = list() #T-test result
  
  for(SEX in c('M','F')){
    curr_meta = covs %>% filter(sex == SEX) %>% mutate(group = paste(timepoint,is_control,sep="_"))
    curr_counts = x[,rownames(curr_meta)]
    
    ###################################################################
    # F-test analysis - training-dea table
    
    #Extract treatment covariate
    tr <- factor(paste(curr_meta$timepoint,curr_meta$is_control,sep="_"),
                 levels=c("8_1","1_0","2_0","4_0","8_0"))
    #Generate the experimental model
    design <- model.matrix(~ 1+tr)
    fit <- lmFit(curr_counts, design)
    fit.eb <- eBayes(fit)
    
    
    #Extract results
    res = topTable(fit.eb, coef = 2:5, n=nrow(fit.eb))
    dt = data.table(tissue=str_split(dataset_name,",")[[1]][1],
                    assay=str_split(dataset_name,",")[[1]][2],
                    feature_ID=rownames(res),
                    fscore=res$`F`,
                    p_value = res$P.Value,
                    full_model = "~1+group",
                    reduced_model = "~1")
    sex_res[[SEX]] = dt
    
    ###############################################################
    # T-test analysis - timewise-dea table
    # Perform T-test using the exercise groups
    
    design = model.matrix(~0+tr)
    
    #Set contrasts
    cont.matrix = makeContrasts(
      tr1_0 - tr8_1, tr2_0 - tr8_1, tr4_0 - tr8_1, tr8_0 - tr8_1, 
      levels = design
    )
    
    colnames(cont.matrix) = c("1w","2w","4w","8w")
    
    # Fit the new model
    limma_model2 = lmFit(curr_counts,design)
    lmfit.cont <- contrasts.fit(limma_model2, cont.matrix)
    lmfit.cont.ebayes <- eBayes(lmfit.cont)

    #Extract results for each timepoint
    for(curr_tp in colnames(lmfit.cont.ebayes$t)){
      
      #Extract results
      limma_res = topTable(lmfit.cont.ebayes,
                           coef = curr_tp,number = Inf,sort.by = "none")
    
      curr_res = data.frame(
        feature_ID = rownames(limma_res),
        assay = strsplit(dataset_name,split=",")[[1]][2],
        tissue = strsplit(dataset_name,split=",")[[1]][1],
        sex = if_else(SEX == "M","male","female"),
        logFC_se = limma_res_extract_se(limma_res,lmfit.cont.ebayes),
        logFC = limma_res$logFC,
        tscore = limma_res$t,
        covariates = NA,
        comparison_group = curr_tp,
        p_value = limma_res$P.Value
      )
      # Add group average intensities
      case_samps = covs %>% 
        filter(sex == SEX &
                 timepoint == substr(curr_tp,1,1)&
                 is_control == 0) %>%
        rownames()
      curr_res$comparison_average_intensity = apply(x[,case_samps],1,mean,na.rm=TRUE)
      
      control_samps = covs %>% 
        filter(sex == SEX &
                 is_control == 1) %>%
        rownames()
      curr_res$reference_average_intensity = apply(x[,control_samps],1,mean,na.rm=TRUE)
      
      # Add NA counts
      curr_res$numNAs = rowSums(is.na(x[,c(case_samps,control_samps)]))
      
      # Add the results
      tpDA_split_sex = rbind(tpDA_split_sex,curr_res)
    }
  }
  
  #Merge F-test results
  merged = data.frame(merge(sex_res[['M']], sex_res[['F']], 
                            by=c("tissue","assay","feature_ID","full_model","reduced_model"), 
                            suffixes=c('_male','_female'))) %>%
    mutate(p_value_male = replace_na(p_value_male,1),
           p_value_female = replace_na(p_value_female,1)) %>%
    mutate(p_value = map2_dbl(p_value_male,p_value_female,function(x,y){sumlog(c(x,y))$p}))
  
  ftest_res_split_sex <- rbind(ftest_res_split_sex,merged)
}

Next, run on protein-corrected PTM data

#Perform DEA only on PTM data
ptm_datasets <- grep("prot-pr", names(proteomics_data),value = T,invert = T)

for(dataset_name in ptm_datasets){
  
  #Extract current dataset and metadata
  x = proteomics_data[[dataset_name]][["norm_filtered"]][["norm_data_b_protein_corrected"]]
  x_annot = proteomics_data[[dataset_name]]$row_annot
  feature_inds = proteomics_data[[dataset_name]][["norm_filtered"]]$feature_inds
  x_annot = x_annot[feature_inds,]
  
  #Specify covariates
  covs = data.frame(
    sex = pheno_data$viallabel_data[colnames(x),"registration.sex"],
    timepoint = pheno_data$viallabel_data[colnames(x),"timepoint"],
    is_control = pheno_data$viallabel_data[colnames(x),"is_control"]
  )
  covs$sex[covs$sex=="2"] = "M";covs$sex[covs$sex=="1"]="F"
  rownames(covs) = colnames(x)
  
  #Variables that save sex-specific results
  sex_res = list() #F-test result
  sex_ttest_res = list() #T-test result
  
  for(SEX in c('M','F')){
    curr_meta = covs %>% filter(sex == SEX) %>% mutate(group = paste(timepoint,is_control,sep="_"))
    curr_counts = x[,rownames(curr_meta)]

    ###################################################################
    # F-test analysis - training-dea table
    
    #Extract treatment covariate
    tr <- factor(paste(curr_meta$timepoint,curr_meta$is_control,sep="_"),
                 levels=c("8_1","1_0","2_0","4_0","8_0"))
    #Generate the experimental model
    design <- model.matrix(~ 1+tr)
    fit <- lmFit(curr_counts, design)
    fit.eb <- eBayes(fit)
    
    
    #Extract results
    res = topTable(fit.eb, coef = 2:5, n=nrow(fit.eb))
    dt = data.table(
      tissue=str_split(dataset_name,",")[[1]][1],
      assay = paste0(strsplit(dataset_name,split=",")[[1]][2],"-protein-corrected"),
      feature_ID=rownames(res),
      fscore=res$`F`,
      p_value = res$P.Value,
      full_model = "~1+group",
      reduced_model = "~1")
    sex_res[[SEX]] = dt
    
    ###############################################################
    # T-test analysis - timewise-dea table
    # Perform T-test using the exercise groups
    
    design = model.matrix(~0+tr)
    
    #Set contrasts
    cont.matrix = makeContrasts(
      tr1_0 - tr8_1, tr2_0 - tr8_1, tr4_0 - tr8_1, tr8_0 - tr8_1, 
      levels = design
    )
    
    colnames(cont.matrix) = c("1w","2w","4w","8w")

    
    # Fit the new model
    limma_model2 = lmFit(curr_counts,design)
    lmfit.cont <- contrasts.fit(limma_model2, cont.matrix)
    lmfit.cont.ebayes <- eBayes(lmfit.cont)
    

    #Extract results for each timepoint
    for(curr_tp in colnames(lmfit.cont.ebayes$t)){
      
      #Extract results
      limma_res = topTable(lmfit.cont.ebayes,
                           coef = curr_tp,
                           number = Inf,
                           sort.by = "none")
    
      curr_res = data.frame(
        feature_ID = rownames(limma_res),
        assay = paste0(strsplit(dataset_name,split=",")[[1]][2],"-protein-corrected"),
        tissue = strsplit(dataset_name,split=",")[[1]][1],
        sex = if_else(SEX == "M","male","female"),
        logFC_se = limma_res_extract_se(limma_res,lmfit.cont.ebayes),
        logFC = limma_res$logFC,
        tscore = limma_res$t,
        covariates = NA,
        comparison_group = curr_tp,
        p_value = limma_res$P.Value
      )
      # Add group average intensities
      case_samps = covs %>% 
        filter(sex == SEX &
                 timepoint == substr(curr_tp,1,1)&
                 is_control == 0) %>%
        rownames()
      curr_res$comparison_average_intensity = apply(x[,case_samps],1,mean,na.rm=TRUE)
      
      control_samps = covs %>% 
        filter(sex == SEX &
                 is_control == 1) %>%
        rownames()
      curr_res$reference_average_intensity = apply(x[,control_samps],1,mean,na.rm=TRUE)
      
      # Add NA counts
      curr_res$numNAs = rowSums(is.na(x[,c(case_samps,control_samps)]))
      
      # Add the results
      tpDA_split_sex = rbind(tpDA_split_sex,curr_res)
    }
  }
  
  #Merge F-test results
  merged = data.frame(merge(sex_res[['M']], sex_res[['F']], 
                            by=c("tissue","assay","feature_ID","full_model","reduced_model"), 
                            suffixes=c('_male','_female'))) %>%
    mutate(p_value_male = replace_na(p_value_male,1),
           p_value_female = replace_na(p_value_female,1)) %>%
    mutate(p_value = map2_dbl(p_value_male,p_value_female,function(x,y){sumlog(c(x,y))$p}))
  
  ftest_res_split_sex <- rbind(ftest_res_split_sex,merged)
}

Scatter plots are displayed below to assess the effects of protein correction of PTM data on the DEA results.

ptm_datasets <- grep("prot-pr", names(proteomics_data),value = T,invert = T)

for(dataset_name in ptm_datasets){
  cur_tissue <- str_split_fixed(dataset_name,",",2)[1]
  cur_assay <- str_split_fixed(dataset_name,",",2)[2]

  uncorrected <-  ftest_res_split_sex %>%
    filter(tissue == cur_tissue &
             assay == cur_assay)

  corrected <- ftest_res_split_sex %>%
    filter(tissue == cur_tissue &
             (assay == paste0(cur_assay,"-protein-corrected")))

  dat <- inner_join(uncorrected,corrected,
                    by = c("tissue","feature_ID","full_model","reduced_model"),
                    suffix = c(".uncorrected",".corrected"))

  g <- ggplot(dat, aes(x = -log10(p_value.uncorrected),
                  y = -log10(p_value.corrected)))+
    geom_point(alpha=0.01)+
    theme_bw()+
    geom_smooth(method="lm",formula = y~x)+
    labs(x = "Uncorrected PTM training-dea\n-log10(p-val)",
         y = "Protein-corrected PTM training-dea\n-log10(p-val)",
         title= dataset_name)
  
  plot(g)
}

6.3 Save DEA results

Save DEA results that split datasets by sex.

tissue_assay_comb = unique(tpDA_split_sex[,c("tissue","assay")])

# timewise tables
for(i in 1:nrow(tissue_assay_comb)){
  tissue = tissue_assay_comb[i,1]
  assay = tissue_assay_comb[i,2]
  currDA = tpDA_split_sex[tpDA_split_sex$tissue == tissue & tpDA_split_sex$assay == assay,]
  local_fname = file.path(local_data_dir, paste0("pass1b-06_",tissue,"_",assay,"_timewise-dea_",suffix))
  curr_out_bucket = paste0(out_bucket,str_extract(assay,"prot-.."),"/",dea_folder_name,"/")
  message("Write Local: ", local_fname)
  write.table(currDA,
              file=local_fname,
              sep="\t",
              quote=FALSE,
              row.names = FALSE,
              col.names = TRUE)
  if(upload_to_bucket){
    cmd = paste(gsutil_cmd, "-m cp", local_fname, curr_out_bucket)
    message("Copy to Bucket: ", cmd)
    system(cmd)
  }
}
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t53-cortex_prot-ph_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t53-cortex_prot-pr_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t55-gastrocnemius_prot-ph_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t55-gastrocnemius_prot-pr_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ph_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-pr_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ac_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ub_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t59-kidney_prot-ph_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t59-kidney_prot-pr_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t66-lung_prot-ph_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t66-lung_prot-pr_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ph_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-pr_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ac_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ub_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t70-white-adipose_prot-ph_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t70-white-adipose_prot-pr_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t53-cortex_prot-ph-protein-corrected_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t55-gastrocnemius_prot-ph-protein-corrected_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ph-protein-corrected_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ac-protein-corrected_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ub-protein-corrected_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t59-kidney_prot-ph-protein-corrected_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t66-lung_prot-ph-protein-corrected_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ph-protein-corrected_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ac-protein-corrected_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ub-protein-corrected_timewise-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t70-white-adipose_prot-ph-protein-corrected_timewise-dea_20210914.txt
# f-test tables
for(i in 1:nrow(tissue_assay_comb)){
  tissue = tissue_assay_comb[i,1]
  assay = tissue_assay_comb[i,2]
  currTable = ftest_res_split_sex[ftest_res_split_sex$tissue == tissue & ftest_res_split_sex$assay==assay,]
  local_fname = file.path(local_data_dir, paste0("pass1b-06_",tissue,"_",assay,"_training-dea_",suffix))
  curr_out_bucket = paste0(out_bucket,str_extract(assay,"prot-.."),"/",dea_folder_name,"/")
  message("Write Local: ", local_fname)
  write.table(currTable,
              file=local_fname,
              sep="\t",
              quote=FALSE,
              row.names = FALSE,
              col.names = TRUE)
  if(upload_to_bucket){
    cmd = paste(gsutil_cmd,"-m cp",local_fname,curr_out_bucket)
    message("\tCopy to Bucket: ", cmd)
    system(cmd)
  }
}
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t53-cortex_prot-ph_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t53-cortex_prot-pr_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t55-gastrocnemius_prot-ph_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t55-gastrocnemius_prot-pr_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ph_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-pr_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ac_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ub_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t59-kidney_prot-ph_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t59-kidney_prot-pr_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t66-lung_prot-ph_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t66-lung_prot-pr_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ph_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-pr_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ac_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ub_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t70-white-adipose_prot-ph_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t70-white-adipose_prot-pr_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t53-cortex_prot-ph-protein-corrected_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t55-gastrocnemius_prot-ph-protein-corrected_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ph-protein-corrected_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ac-protein-corrected_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t58-heart_prot-ub-protein-corrected_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t59-kidney_prot-ph-protein-corrected_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t66-lung_prot-ph-protein-corrected_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ph-protein-corrected_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ac-protein-corrected_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t68-liver_prot-ub-protein-corrected_training-dea_20210914.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/pass1b-06_t70-white-adipose_prot-ph-protein-corrected_training-dea_20210914.txt

7. Save normalized data to output bucket

Save normalized and batch corrected data

for(dataset_name in names(proteomics_data)){
  
  #Normalized and batch corrected data
  x = proteomics_data[[dataset_name]][["norm_filtered"]][["norm_data_b"]]
  
  #Normalized, batch corrected, and imputed
  x_imp = proteomics_data[[dataset_name]][["norm_filtered"]][["norm_data_b_imp"]]
  
  #Feature annotations
  x_annot = proteomics_data[[dataset_name]]$row_annot
  
  # get the correct row annotation (was filtered by NAs above)
  feature_inds = proteomics_data[[dataset_name]][["norm_filtered"]]$feature_inds
  x_annot = x_annot[feature_inds,]
  
  arr = strsplit(dataset_name,split=",")[[1]]
  curr_ome = tolower(arr[2])
  curr_t = tolower(arr[1])
  out_b = paste0(out_bucket,curr_ome,"/data/")
  
  # add bid and pid
  x_bids = pheno_data$viallabel_data[colnames(x),"bid"]
  x_pids = pheno_data$viallabel_data[colnames(x),"pid"]
  newx = x
  for(j in 1:ncol(newx)){newx[,j] = round(as.numeric(newx[,j]),digits=5)}
  newx = rbind(x_bids,newx)
  newx = rbind(x_pids,newx)
  newx = rbind(colnames(x),newx)
  colnames(newx) = NULL
  
  newx_imp = x_imp
  for(j in 1:ncol(newx_imp)){newx_imp[,j] = round(as.numeric(newx_imp[,j]),digits=5)}
  newx_imp = rbind(x_bids,newx_imp)
  newx_imp = rbind(x_pids,newx_imp)
  newx_imp = rbind(colnames(x_imp),newx_imp)
  colnames(newx_imp) = NULL
  
  # a simple sanity check
  m = newx[-c(1:3),]
  mode(m) = "numeric"
  if(!all(abs(m-x)< 1e-05,na.rm=TRUE)){
    print("Error in parsing the matrix, breaking")
    break
  }
  # a simple sanity check
  m = newx_imp[-c(1:3),]
  mode(m) = "numeric"
  if(!all(abs(m-x)< 1e-05,na.rm=TRUE)){
    print("Error in parsing the matrix, breaking")
    break
  }
  
  rownames(newx)[1:3] = c("viallabel","pid","bid")
  rownames(newx_imp)[1:3] = c("viallabel","pid","bid")
  
  #Save the output to the target bucket
  # Median-MAD normalized TMT ratios
  local_fname = file.path(local_data_dir, 
                          paste0("motrpac_pass1b-06_",
                                 curr_t,"_",curr_ome,
                                 "_med-mad-normalized-logratio.txt"))
  message("Write Local: ", local_fname)
  write.table(newx,
              file=local_fname,
              sep="\t",
              quote=FALSE,
              row.names = TRUE,
              col.names = FALSE)
  if(upload_to_bucket){
    cmd = paste(gsutil_cmd,"cp",local_fname,out_b)
    message("Copy to Bucket: ", cmd)
    system(cmd)    
  }

  # Median-MAD normalized and imputed TMT ratios
  local_fname_imp = file.path(local_data_dir,
                              paste0("motrpac_pass1b-06_",
                                     curr_t,"_",curr_ome,
                                     "_med-mad-normalized-imputed-logratio.txt"))
  message("Write Local: ", local_fname_imp)
  write.table(newx_imp,
              file=local_fname_imp,
              sep="\t",
              quote=FALSE,
              row.names = TRUE,
              col.names = FALSE)
  if(upload_to_bucket){
    cmd = paste(gsutil_cmd,"cp",local_fname_imp,out_b)
    message("Copy to Bucket: ", cmd)
    system(cmd)
  }

  #Feature annotations
  local_fname2 = file.path(local_data_dir, 
                           paste0("motrpac_pass1b-06_",
                                  curr_t,"_",curr_ome,
                                  "_normalized-data-feature-annot.txt"))
  x_annot = rbind(colnames(x_annot),x_annot)
  colnames(x_annot) = NULL
  x_annot = cbind(rownames(x_annot),x_annot)
  rownames(x_annot) = NULL
  x_annot[,1] = as.character(x_annot[,1])
  x_annot[1,1] = "feature_ID"
  message("Write Local: ", local_fname2)
  write.table(x_annot,
              file=local_fname2,
              sep="\t",
              quote=FALSE,
              row.names = FALSE,
              col.names = FALSE)
  if(upload_to_bucket){
    cmd = paste(gsutil_cmd,"cp",local_fname2,out_b)
    message("Copy to Bucket: ", cmd)
    system(cmd)
  }
}
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t53-cortex_prot-ph_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t53-cortex_prot-ph_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t53-cortex_prot-ph_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t53-cortex_prot-pr_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t53-cortex_prot-pr_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t53-cortex_prot-pr_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t55-gastrocnemius_prot-ph_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t55-gastrocnemius_prot-ph_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t55-gastrocnemius_prot-ph_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t55-gastrocnemius_prot-pr_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t55-gastrocnemius_prot-pr_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t55-gastrocnemius_prot-pr_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ph_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ph_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ph_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-pr_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-pr_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-pr_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ac_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ac_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ac_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ub_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ub_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ub_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t59-kidney_prot-ph_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t59-kidney_prot-ph_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t59-kidney_prot-ph_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t59-kidney_prot-pr_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t59-kidney_prot-pr_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t59-kidney_prot-pr_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t66-lung_prot-ph_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t66-lung_prot-ph_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t66-lung_prot-ph_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t66-lung_prot-pr_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t66-lung_prot-pr_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t66-lung_prot-pr_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ph_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ph_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ph_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-pr_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-pr_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-pr_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ac_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ac_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ac_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ub_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ub_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ub_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t70-white-adipose_prot-ph_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t70-white-adipose_prot-ph_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t70-white-adipose_prot-ph_normalized-data-feature-annot.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t70-white-adipose_prot-pr_med-mad-normalized-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t70-white-adipose_prot-pr_med-mad-normalized-imputed-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t70-white-adipose_prot-pr_normalized-data-feature-annot.txt

Save normalized, batch corrected, and protein-corrected data. Only applicable for PTM datasets.

#Perform DEA only on PTM data
ptm_datasets <- grep("prot-pr",names(proteomics_data),value = T,invert = T)

for(dataset_name in ptm_datasets){
  
  #Normalized, batch corrected, and protein corrected
  x_prot_cor = proteomics_data[[dataset_name]][["norm_filtered"]][["norm_data_b_protein_corrected"]]

  #Get ome and tissue values from dataset name
  arr = strsplit(dataset_name,split=",")[[1]]
  curr_ome = tolower(arr[2])
  curr_t = tolower(arr[1])
  out_b = paste0(out_bucket,curr_ome,"/data/")
  
  #Add bid and pid
  x_bids = pheno_data$viallabel_data[colnames(x_prot_cor),"bid"]
  x_pids = pheno_data$viallabel_data[colnames(x_prot_cor),"pid"]
  newx_prot_cor = x_prot_cor
  for(j in 1:ncol(newx_prot_cor)){newx_prot_cor[,j] = round(as.numeric(newx_prot_cor[,j]),digits=5)}
  newx_prot_cor = rbind(x_bids,newx_prot_cor)
  newx_prot_cor = rbind(x_pids,newx_prot_cor)
  newx_prot_cor = rbind(colnames(x_prot_cor),newx_prot_cor)
  colnames(newx_prot_cor) = NULL
  rownames(newx_prot_cor)[1:3] = c("viallabel","pid","bid")
  
  # save the output to the target bucket
  local_fname = file.path(local_data_dir, 
                          paste0("motrpac_pass1b-06_",
                                 curr_t,"_",curr_ome,
                                 "_med-mad-normalized-protein-corrected-logratio.txt"))
  message("Write Local: ", local_fname)
  write.table(newx_prot_cor,
              file=local_fname,
              sep="\t",
              quote=FALSE,
              row.names = TRUE,
              col.names = FALSE)
  if(upload_to_bucket){
    cmd = paste(gsutil_cmd,"cp",local_fname,out_b)
    message("Copy to Bucket: ", cmd)
    system(cmd)
  }
}
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t53-cortex_prot-ph_med-mad-normalized-protein-corrected-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t55-gastrocnemius_prot-ph_med-mad-normalized-protein-corrected-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ph_med-mad-normalized-protein-corrected-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ac_med-mad-normalized-protein-corrected-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t58-heart_prot-ub_med-mad-normalized-protein-corrected-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t59-kidney_prot-ph_med-mad-normalized-protein-corrected-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t66-lung_prot-ph_med-mad-normalized-protein-corrected-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ph_med-mad-normalized-protein-corrected-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ac_med-mad-normalized-protein-corrected-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t68-liver_prot-ub_med-mad-normalized-protein-corrected-logratio.txt
## Write Local: ~/github/MoTrPAC/motrpac-bic-norm-qc/data/proteomics/DF_PASS_20210731/motrpac_pass1b-06_t70-white-adipose_prot-ph_med-mad-normalized-protein-corrected-logratio.txt